Objectives:

Exploratory Data Analysis

Exploratory Data Analysis is about: - univariate non-graphical EDA - univariate graphical EDA - multivariate non-graphical EDA - multivariate graphical EDA

Methods to explore:
- Variation (describe behavior within variable)
- Covariation (describe behavior between variables)
- Missing data / anomalies
- Patterns / correlations

Tools: - tables
- scatterplot
- histogram
- boxplot
- heatplot and hex plots

Pull in new dataset.

Dataset from Department of Labor about applications for permanent employment certification. Download the “PERM” FY 2021 Disclosure file (xlsx) and put in folder /lessons/data.

library(tidyverse)
library(readxl)
library(gt)
library(gtExtras) #remotes::install_github("jthomasmock/gtExtras")
# in some cases, we need to change guess_max because the column may have nulls for a lot of the first rows
df = read_excel("lessons/data/PERM_Disclosure_Data_FY2021_Q3.xlsx",guess_max=15000)
head(df,2) %>% as.data.frame()

Do general data checks to understand the data you are dealing with.

# validate if all cases are distinct
df %>% distinct(CASE_NUMBER) %>% nrow() == nrow(df)
## [1] TRUE
# look at received yr
df %>% mutate(received_yr=lubridate::year(RECEIVED_DATE)) %>% group_by(received_yr) %>% count() %>% ungroup()
# look at decision yr
df %>% mutate(decision_yr=lubridate::year(DECISION_DATE)) %>% group_by(decision_yr) %>% count() %>% ungroup()
# look at employers
df %>% group_by(EMPLOYER_NAME) %>% count() %>% ungroup() %>% arrange(desc(n))

Now, we can make some interesting graphics / tables to describe trends.

summary(df$PW_WAGE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       7   67413   94307   91555  116334 6169300      10
wages = df %>%
    group_by(PW_WAGE) %>% count() %>% ungroup()

wages %>%
    ggplot(aes(x=PW_WAGE,y=n)) + geom_bar(stat='identity')
## Warning: Removed 1 rows containing missing values (position_stack).

wages %>%
  ggplot(aes(x=PW_WAGE,y=n)) + geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).

df %>%
  group_by(PW_WAGE) %>% count() %>% ungroup() %>%
  ggplot(aes(x=PW_WAGE,y=n)) + geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).

df %>%
  group_by(PW_WAGE) %>% count() %>% ungroup() %>%
  ggplot(aes(x=PW_WAGE,y=n)) + geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).

top_10_employers = df %>% group_by(EMPLOYER_NAME) %>% count() %>% ungroup() %>% arrange(desc(n)) %>% head(10)

df_top_10 = df %>% dplyr::filter(EMPLOYER_NAME %in% top_10_employers$EMPLOYER_NAME)

gg_all = df %>% 
  ggplot(aes(x=PW_WAGE)) +
  geom_histogram(breaks=seq(0,300000,by=10000),fill="black",colour="white") +
  scale_x_continuous(name="Wage",breaks=seq(0,300000,by=50000),labels=scales::comma) +
  scale_y_continuous(name="Applicants w/ Wage",labels=scales::comma) +
  theme_bw() +
  ggtitle("Wages per Applicant (2020-2021)")

gg_all
## Warning: Removed 10 rows containing non-finite values (stat_bin).

gg_top10 = df %>% 
  dplyr::filter(EMPLOYER_NAME %in% top_10_employers$EMPLOYER_NAME) %>%
  ggplot(aes(x=PW_WAGE)) +
  geom_histogram(breaks=seq(0,300000,by=10000),fill="black",colour="white") +
  scale_x_continuous(name="Wage",breaks=seq(0,300000,by=50000),labels=scales::comma) +
  scale_y_continuous(name="Applicants w/ Wage",labels=scales::comma) +
  theme_bw() +
  ggtitle("Wages per Applicant (2020-2021)")
  
ggplot() +
  geom_histogram(data = df,mapping=aes(x=PW_WAGE),breaks=seq(0,300000,by=10000),fill="grey",colour="white") +
  geom_histogram(data = df_top_10,mapping=aes(x=PW_WAGE),breaks=seq(0,300000,by=10000),fill="black",colour="white") +
  scale_x_continuous(name="Wage",breaks=seq(0,300000,by=50000),labels=scales::comma) +
  scale_y_continuous(name="Applicants w/ Wage",labels=scales::comma) +
  theme_bw() +
  ggtitle(label = "Wages per Applicant (2020-2021)", subtitle = "Black represents top 10 employers")
## Warning: Removed 10 rows containing non-finite values (stat_bin).

df_top_10 = df %>% dplyr::filter(EMPLOYER_NAME %in% top_10_employers$EMPLOYER_NAME)

top_paid = df %>% arrange(desc(PW_WAGE)) %>% top_frac(0.1,wt = PW_WAGE) %>% 
  group_by(EMPLOYER_NAME) %>% summarise(n=n(),sum=sum(PW_WAGE,na.rm=TRUE),max=max(PW_WAGE,na.rm=TRUE),med=stats::median(PW_WAGE,na.rm=TRUE)) %>% ungroup() %>% 
  arrange(desc(n)) %>% head(10)

df %>% dplyr::filter(EMPLOYER_NAME %in% top_paid$EMPLOYER_NAME)
# gtExtras and gt_plt_bullet() is a new package/function and the syntax may change in the future
top_paid %>% 
  dplyr::mutate(plot_column = max) %>%
  gt() %>% gt_plt_bullet(column=plot_column,target=med,width=45,colors= c("lightblue","black")) %>%
  gt::cols_label(EMPLOYER_NAME=md("Employer"),n=md("Count"),sum=md("Sum"),max=md("Wage")) %>% gt::fmt_number(columns=c(sum,max),decimals=0)
Employer Count Sum Wage plot_column
TATA CONSULTANCY SERVICES LIMITED 530 72,837,215 138,861
MICROSOFT CORPORATION 262 40,226,825 189,613
NVIDIA CORPORATION 146 23,867,999 252,117
APPLE INC. 126 19,827,183 215,093
ORACLE AMERICA, INC. 115 17,879,882 206,407
Adobe Inc. 96 15,659,525 238,285
AMAZON.COM SERVICES LLC 92 13,775,819 172,640
SALESFORCE.COM 78 12,103,522 231,088
THE BOSTON CONSULTING GROUP 75 10,729,576 162,094
CISCO SYSTEMS, INC. 69 10,266,896 191,194

Resources